* Session settings used for the whole run: graph scheme, per-command return
* messages, and the width of log/output lines.
set scheme s1mono
set rmsg on
set linesize 140
/******************************************************************************
	Project		:	Army Service in the All-Volunteer Era
	Author(s)	:	Kyle Greenberg	(kyle.greenberg@westpoint.edu)
					Matthew Gudgeon (matthew.gudgeon@westpoint.edu)
					Adam Isen 		(Adam.Isen@treasury.gov)
					Corbin Miller 	(Corbin.Miller@treasury.gov)
					Rich Patterson 	(rich_patterson@byu.edu)
	File Name	:	prep-data.do
	Description	:	Preps the data for dynamic analysis by merging relative
					year tax data to MEPCOM data.
*******************************************************************************/

*---- if not running master, set up file structure ----*
* Change directory to where the programs and subfolders are stored:
*cd ""

* Each folder global falls back to a relative default only when it is still
* empty, so values already set by a calling script are left untouched.
if "${raw}" == "" {
	global raw "raw/"
}
if "${data}" == "" {
	global data "data/"
}
if "${output}" == "" {
	global output "output/"
}

* Create the data folder; capture ignores the error raised if mkdir fails
* (for example because the folder already exists).
capture mkdir "${data}"

*----------------------------------------------*
*---- pull together and clean dynamic data ----*
*----------------------------------------------*
use "${raw}mepcom-treasury", clear

* Calendar year and calendar quarter of the daily date firstafqtdt.
generate firstyear = year(firstafqtdt)
generate quarterFE = qofd(firstafqtdt)

* Carry only the identifiers and timing variables needed for the merges below;
* the remaining covariates are merged back in once the loop has finished.
keep pid ssn tin_anymatch access ///
	firstafqt firstafqt_fy firstyear quarterFE
compress

* Recode the 2004.75 value of firstafqt_fy to 3000.
* NOTE(review): the reason for this recode is not visible in this file -- confirm.
replace firstafqt_fy = 3000 if firstafqt_fy == 2004.75

* Records without a matched TIN cannot be linked to tax data: drop them, then
* give the identifier the name used by the tax files.
drop if missing(tin_anymatch)
rename tin_anymatch tin

*---- bring in relative-year tax information ----*
* For every year relative to firstyear (-1 through 19): merge that tax year's
* records onto the one-row-per-person file, build the outcome variables, and
* store them with a relative-year suffix (_m1, _0, ..., _19). All helper
* variables are dropped at the end of each pass so the next pass can recreate
* them under the same names.
forval y=-1/19 {
	* suffix for this relative year; -1 becomes "m1" so the variable name is valid
	local yt = `y'
	if `y' == -1 local yt m1
	gen tax_yr = firstyear+`y'

	*cpi
	merge m:1 tax_yr using "${raw}CPI-annual", nogen keep(1 3)
	
	*tax data outcomes
	merge 1:1 tin tax_yr using "${raw}tin-income", nogen keep(1 3) keepusing(wages f1099misc_inc anyf1098T married anyf1098 anyf1040 filing_status gov nonprofit militaryjob zip_income military armywages othermilwages nonmilwages profit_loss profit_loss_ontin profit_loss_se)
	gen anywages = wages>0 & !missing(wages)

	*nontaxable army earnings
	merge 1:1 pid tax_yr using "${raw}nontax", nogen keep(1 3) keepusing(tot_non_tax)

	*date of death
	merge m:1 ssn using "${raw}ssn_list", nogen keep(1 3) keepusing(ssa_dod)

	*---- self employment income ----*
	
	*1) if 2007 or later, set to ssn matched value
	gen self = profit_loss if tax_yr>=2007
	*2) if 2006 or earlier and filing_status!=2, then set to tin matched value
	replace self = profit_loss_ontin if tax_yr<2007 & filing_status!=2
	*3) if 2006 or earlier and filing_status==2, then set to se matched value if it is positive
	*   (a missing profit_loss_se also satisfies >0, but self is still missing for these rows, so nothing changes)
	replace self = profit_loss_se if tax_yr<2007 & filing_status==2 & profit_loss_se>0
	*4) if 2006 or earlier and filing_status==2, se matched value is 0 or missing, and tin matched value is below 400,
	*   then set to 1/2 of the tin matched value
	*   NOTE(review): the condition is profit_loss_ontin<400, not <0 -- confirm the 400 threshold is intended
	replace self=profit_loss_ontin/2 if tax_yr<2007 & filing_status==2 & inlist(profit_loss_se,0,.) & profit_loss_ontin<400
	*5) fill in missing with f1099misc earnings (if any) when filing status is missing
	replace self = f1099misc_inc if filing_status==. & self==.
	gen anyself = self!=0 if !missing(self)
	*rows filled by step 4 count as one half in the indicator, mirroring the halved amount
	replace anyself = 1/2 if anyself==1 & tax_yr<2007 & filing_status==2 & inlist(profit_loss_se,0,.) & profit_loss_ontin<400
	
	*---- mortality ----*
	*ssa_dod/10000 floored gives the year (NOTE(review): presumably ssa_dod is a yyyymmdd number -- confirm); 0 is treated as no date
	gen death_year = floor(ssa_dod/10000) if ssa_dod!=0
	*a missing death_year compares as larger than any tax_yr, so people with no death date get 0
	gen mortality = tax_yr>=death_year if !missing(tax_yr) & inrange(tax_yr,1990,2018) & firstyear<=death_year
	
	*---- adjust for inflation ----*
	*NOTE(review): 251.1041667 is presumably the CPI level of the base year -- confirm
	foreach x in wages armywages othermilwages nonmilwages tot_non_tax self zip_income {
		replace `x' = `x'*251.1041667/cpiaucsl
	}
	
	*---- deal with missing information ----*
	*within tax years 1999-2018 a missing value is set to zero; outside that window everything is set to missing
	foreach x in wages armywages othermilwages nonmilwages tot_non_tax self anywages anyf1098T anyf1098 military married anyf1040 anyself zip_income {
		replace `x' = 0 if `x'==. & inrange(tax_yr,1999,2018)
		replace `x' = . if !inrange(tax_yr,1999,2018)
	}
	
	*---- add nontaxable military income to wages ----*
	gen wages_plus = wages + tot_non_tax if tot_non_tax!=0
	*get ratio of nontaxable to taxable in the army to apply to other military
	gen ratio = tot_non_tax/armywages if armywages>0 & !missing(armywages)
	egen meanratio = mean(ratio), by(firstyear)
	*assign army values to wages_plus_os
	gen wages_plus_os = wages_plus
	*assign adjusted values for other military wages if no army wages
	replace wages_plus_os = (1+meanratio)*othermilwages + nonmilwages if othermilwages!=0 & missing(wages_plus_os)
	*fill in wages for those without any military earnings (so this will be = wages unless there are any military wages present)
	replace wages_plus_os = wages if missing(wages_plus_os)
	*finish filling in non-army wages
	replace wages_plus = wages if missing(wages_plus)
		
	*from here on "wages" is the measure including imputed nontaxable military pay; the W-2 amount is kept as rawwage
	rename wages rawwage
	rename wages_plus_os wages
	
	*winsorize income variables: floor at 0 (self: floor at its 1st percentile) and cap at the 99th percentile
	*percentiles come from the summarize call, i.e. they are computed before the floor is applied
	foreach x in rawwage wages self zip_income {
		sum `x', d
		if "`x'"!="self" replace `x' = 0 if `x'<0
		if "`x'"=="self" replace `x' = r(p1) if `x'<r(p1)
		replace `x' = r(p99) if `x'>r(p99) & `x'!=.
	}

	*---- employment type (needs inflation-adjusted/winsorized wages) ----*
	
	*enrolled in post-secondary education and inflation-adjusted wages<$15000 
	gen edlt15 = anyf1098T==1 & wages<15000 if !missing(anywages)
	*active duty military is highest paying w-2 
	gen mil = militaryjob==1 & nonprofit==0 & anywages==1 & edlt15==0 if !missing(anywages)
	*other public sector is highest paying w-2
	gen public = gov==1 & nonprofit==0 & anywages==1 & mil!=1 & edlt15==0 if !missing(anywages)
	*other employment (not military or public sector) is highest paying w-2, 
	gen private = anywages==1 & mil!=1 & public!=1 & edlt15==0 if !missing(anywages)
	*not enrolled in post-secondary education and inflation-adjusted wages<$15000 
	*(variable names are written out in full so this does not depend on variable abbreviation being enabled)
	gen noedlt15 = anyf1098T==0 & wages<15000 & !inlist(1,private,mil,public,edlt15) if !missing(anywages)
	
	*---- counterfactuals ----*
	*negative of the outcome for access==0 rows and zero for access==1 rows
	foreach x in wages anywages married anyf1098{
		gen cf_`x' = -`x'*(1-access)
	}
	*same construction for wages, split by firstyear cohort
	gen cf_90_92 = -wages*(1-access) if inrange(firstyear,1990,1992)
	gen cf_93_95 = -wages*(1-access) if inrange(firstyear,1993,1995)
	gen cf_96_98 = -wages*(1-access) if inrange(firstyear,1996,1998)
	gen cf_99_01 = -wages*(1-access) if inrange(firstyear,1999,2001)
	gen cf_02_04 = -wages*(1-access) if inrange(firstyear,2002,2004)
	gen cf_05_08 = -wages*(1-access) if inrange(firstyear,2005,2008)
	
	*---- rename with relative year suffix ----*
	foreach x in wages self anyself rawwage anywages anyf1098T anyf1098 married military anyf1040 public zip_income mortality cf_wages cf_anywages cf_married cf_anyf1098 cf_90_92 cf_93_95 cf_96_98 cf_99_01 cf_02_04 cf_05_08{
		rename `x' `x'_`yt'
	}
	
	*---- remove unnecessary variables ----*
	*everything listed here is recreated or re-merged on the next pass of the loop
	drop tax_yr cpiaucsl f1099misc_inc gov nonprofit militaryjob edlt15 mil private noedlt15 profit_loss profit_loss_ontin profit_loss_se filing_status ratio meanratio armywages tot_non_tax othermilwages nonmilwages wages_plus death_year ssa_dod

	desc, fullnames
}

* Bring the remaining variables back in from the source file (one row per pid),
* keeping every row already in memory whether or not it matches.
merge 1:1 pid using "${raw}mepcom-treasury", nogen keep(1 3) ///
	keepusing(years_served age_days firstafqt male white black hisp ///
		educ_lessthanhs educ_inhs educ_ged educ_hsdip educ_some_coll educ_coll_grad ///
		inst31 k31 instk31 inst50 k50 instk50 hor_state last_afqt_pctl)

* Squared terms, stored as <name>_2.
foreach v in instk31 k31 instk50 k50 {
	generate `v'_2 = `v'^2
}

* Constant equal to 1 for every row.
generate all = 1

compress
save "${data}army-treasury-analysis", replace

*-----------------------------------------------------*
*---- clean and prep data for cumulative outcomes ----*
*-----------------------------------------------------*
use "${data}army-treasury-analysis", clear

* Blank out the 1098-T indicator in every relative year for the 1990-1998
* firstyear cohorts.
forvalues y = 0/19 {
	replace anyf1098T_`y' = . if inrange(firstyear,1990,1998)
}

* First relative year (scanning upward from 0) with non-missing wages.
generate min_year_after_firstafqt = .
forvalues y = 0/19 {
	replace min_year_after_firstafqt = `y' if wages_`y'!=. & min_year_after_firstafqt==.
}

* Last relative year (scanning downward from 19) with non-missing wages.
generate max_year_after_firstafqt = .
forvalues y = 19(-1)0 {
	replace max_year_after_firstafqt = `y' if wages_`y'!=. & max_year_after_firstafqt==.
}

* Number of relative years spanned by the observed window.
generate yrs_observed_0_19 = max_year_after_firstafqt - min_year_after_firstafqt + 1

* Years of the window falling in relative years 11-19 and 15-19; the count is
* floored at zero and set to zero where it would otherwise be missing.
foreach lo in 11 15 {
	local base = `lo' - 1
	generate yrs_observed_`lo'_19 = max(0,max_year_after_firstafqt-max(`base',min_year_after_firstafqt))
	replace yrs_observed_`lo'_19 = 0 if yrs_observed_`lo'_19==.
}

* Log a summary of the window variables separately for each firstyear cohort.
forvalues y = 1990/2011 {
	display `y'
	summarize firstyear min_year_after_firstafqt max_year_after_firstafqt yrs_observed_* if firstyear==`y'
}

* Ever received a 1098-T in relative years 0-19 (the year -1 indicator is
* dropped first so the wildcard below does not pick it up).
drop anyf1098T_m1
egen everanyf1098T = rowmax(anyf1098T*)

*identify tax year of last 1098-T receipt
*start at relative year 0 for anyone with a 1098-T, then overwrite with each later year that has one
gen max_1098_yr=0 if everanyf1098T==1
forvalues k=1/19{
	replace max_1098_yr=`k' if anyf1098T_`k'==1  
}
gen tax_yr=firstyear+max_1098_yr

*merge in school eins
merge 1:1 tin tax_yr using "${raw}tin-income", nogen keep(1 3) keepusing(school_ein1)

*keep first school in list
rename school_ein1 payer_tin

*merge in school quality values
merge m:1 payer_tin using "${raw}school-quality_nomil", nogen keep(1 3) keepusing(wage_mean)
gen school_q = wage_mean if everanyf1098T==1

*mean wage from the no-school file, read without disturbing the data in memory
preserve
	use "${data}noschool_mean_wage", clear
	sum wage
	local meanwage = r(mean)
restore

*assign the no-school mean wage to people who never received a 1098-T
*(the condition tests school_q itself; no variable named school_q_imp exists in this dataset)
replace school_q = `meanwage' if missing(school_q) & everanyf1098T==0

*create some alternative earnings and employment measures
forvalues y=0/19{
	*log wages (missing where wages are zero or missing)
	gen lnwages_`y' = ln(wages_`y')
	*wages above 15000; left missing when wages are missing so that unobserved
	*years do not enter the row means below as zeros
	*NOTE(review): this was previously coded 0 when wages were missing -- confirm downstream estimates
	gen alt_emp_`y' = wages_`y'>15000 if !missing(wages_`y')
}

*person-level means over relative years 0-19, 11-19 and 15-19 (rowmean ignores missing years)
foreach outcome in wages lnwages anywages anyf1098T alt_emp {
	*`outcome'_? matches relative years 0-9 and `outcome'_1? matches 10-19
	egen `outcome'_0_19_mean = rowmean(`outcome'_? `outcome'_1?)
	egen `outcome'_11_19_mean = rowmean(`outcome'_11 `outcome'_12 `outcome'_13 `outcome'_14 `outcome'_15 `outcome'_16 `outcome'_17 `outcome'_18 `outcome'_19)
	egen `outcome'_15_19_mean = rowmean(`outcome'_15 `outcome'_16 `outcome'_17 `outcome'_18 `outcome'_19)
}

*demographic recodes
gen female = male==0 if !missing(male)
*numeric version of hor_state; rows with no state get the code 100
encode(hor_state), gen(app_state)
replace app_state=100 if missing(app_state)
sum
*combined education groups
gen lo_educ = (educ_lessthanhs + educ_ged)
gen hi_educ = (educ_some_coll + educ_coll_grad)

compress

*keep identifiers, covariates, observation-window variables and cumulative means only
keep pid firstafqt_fy firstafqt ssn tin firstyear quarterFE access age_days male white black hisp educ_lessthanhs educ_inhs educ_ged educ_hsdip educ_some_coll educ_coll_grad hor_state inst31 k31 instk31 k31_2 instk31_2 inst50 k50 instk50 k50_2 instk50_2 min_year_after_firstafqt max_year_after_firstafqt yrs_observed_0_19 yrs_observed_11_19 yrs_observed_15_19 *_0_19_mean *_11_19_mean *_15_19_mean female app_state lo_educ hi_educ all everanyf1098T school_q

save "${data}cumulative-analysis", replace
